package au.com.acpfg.misc.biojava; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.datatype.DatatypeConstants; import org.biojava.bio.seq.DNATools; import org.biojava.bio.seq.RNATools; import org.biojava.bio.symbol.SymbolList; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.RowIterator; import org.knime.core.data.collection.CollectionCellFactory; import org.knime.core.data.collection.ListCell; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.StringCell; import org.knime.core.data.vector.bitvector.DenseBitVector; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.ExecutionContext; import org.knime.core.node.NodeLogger; public class TrypticPeptideExtractor implements BioJavaProcessorInterface { /** * this is the maximum measurable peptide length (in numbers of AA's) which is * likely to be measured on the available instrumentation. Due to the mass range limitation * of most mass spectrometers, this is likely to be a reasonable limit on tryptic peptides */ public static final int MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA = 30; // HACK: static codon table - ok for my needs protected static HashMap<String,String> aa_map = new HashMap<String,String>(); static { aa_map.put("TTT", "F"); aa_map.put("TTC", "F"); aa_map.put("TTA", "L"); aa_map.put("TTG", "L"); // standard DNA codon table (from wikipedia) aa_map.put("CTT", "L"); aa_map.put("CTC", "L"); aa_map.put("CTA", "L"); aa_map.put("CTG", "L"); aa_map.put("ATT", "I"); aa_map.put("ATC", "I"); aa_map.put("ATA", "I"); aa_map.put("ATG", "M"); aa_map.put("GTT", "V"); aa_map.put("GTC", "V"); aa_map.put("GTA", "V"); aa_map.put("GTG", "V"); aa_map.put("TCT", "S"); aa_map.put("TCC", "S"); aa_map.put("TCA", "S"); aa_map.put("TCG", "S"); aa_map.put("CCT", "P"); aa_map.put("CCC", "P"); aa_map.put("CCA", "P"); aa_map.put("CCG", "P"); aa_map.put("ACT", "T"); aa_map.put("ACC", "T"); aa_map.put("ACA", "T"); aa_map.put("ACG", "T"); aa_map.put("GCT", "A"); aa_map.put("GCC", "A"); aa_map.put("GCA", "A"); aa_map.put("GCG", "A"); aa_map.put("TAT", "Y"); aa_map.put("TAC", "Y"); aa_map.put("TAA", "*"); aa_map.put("TAG", "*"); aa_map.put("CAT", "H"); aa_map.put("CAC", "H"); aa_map.put("CAA", "Q"); aa_map.put("CAG", "Q"); aa_map.put("AAT", "N"); aa_map.put("AAC", "N"); aa_map.put("AAA", "K"); aa_map.put("AAG", "K"); aa_map.put("GAT", "D"); aa_map.put("GAC", "D"); aa_map.put("GAA", "E"); aa_map.put("GAG", "E"); aa_map.put("TGT", "C"); aa_map.put("TGC", "C"); aa_map.put("TGA", "*"); aa_map.put("TGG", "W"); aa_map.put("CGT", "R"); aa_map.put("CGC", "R"); aa_map.put("CGA", "R"); aa_map.put("CGG", "R"); aa_map.put("AGT", "S"); aa_map.put("AGC", "S"); aa_map.put("AGA", "R"); aa_map.put("AGG", "R"); aa_map.put("GGT", "G"); aa_map.put("GGC", "G"); aa_map.put("GGA", "G"); aa_map.put("GGG", "G"); }; public TrypticPeptideExtractor(BioJavaProcessorNodeModel m, String task) { } @Override public void execute(BioJavaProcessorNodeModel mdl, ExecutionContext exec, NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c) throws Exception { // 1. scan the rows to get the available peptides (whether tryptic or not) int n_rows = inData[0].getRowCount(); double done = 0.0; RowIterator it = inData[0].iterator(); boolean is_prot = mdl.areSequencesProtein(); boolean is_dna = mdl.areSequencesDNA(); Pattern p = Pattern.compile("([KR][^KR]{6,17}[KR])[^P]"); while (it.hasNext()) { DataRow r = it.next(); String[] seqs = new String[6]; String[] na_seqs = new String[6]; // RNA reading frames if !is_prot (MUST be in the same frame order as seqs) String str = mdl.getSequence(r); if (str == null || str.length() < 1) continue; if (!is_prot) { SymbolList syms = mdl.getSequenceAsSymbol(str); for (int i=0; i<3; i++) { // take the reading frame SymbolList rf = syms.subList(i+1, syms.length()-(syms.length() - i) % 3); // if it is DNA transcribe it to RNA first if (is_dna) { rf = DNATools.toRNA(rf); } if (!is_prot) { na_seqs[i+3] = rf.seqString().toUpperCase(); } SymbolList prot = RNATools.translate(rf); seqs[i+3] = prot.seqString(); // reverse frame translation rf = RNATools.reverseComplement(rf); prot = RNATools.translate(rf); seqs[i] = prot.seqString(); if (!is_prot) { na_seqs[i] = rf.seqString().toUpperCase(); } } } else { // only one sequence as there is no frame translation involved seqs = new String[] { mdl.getSequence(r).toUpperCase() }; } // process the available protein sequences looking for tryptic peptides DataCell[] cells = new DataCell[5]; cells[0] = new StringCell(mdl.getSequence(r)); cells[1] = DataType.getMissingCell(); cells[2] = DataType.getMissingCell(); cells[3] = DataType.getMissingCell(); cells[4] = DataType.getMissingCell(); // 1. compute the fully tryptic in-silico peptides present in the available sequence data HashSet<String> unambg_tryptics = new HashSet<String>(); // peptides without any IUPAC base call in them (where known) HashSet<String> ambg_tryptics = new HashSet<String>(); // peptides which have one or more IUPAC base calls HashSet<String> weak_tryptics = new HashSet<String>(); // peptides which have ambiguity at the terminii (are we sure they are K/R)? int idx = 0; for (String seq : seqs) { Matcher m = p.matcher(seq); while (m.find()) { String pep = m.group(1); int start_pos = m.start(1); // skip peptides if they are too long to be reliably measured if (pep.length() > MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) continue; // peptide have an unknown residue due to ambiguous (IUPAC) base call at DNA/RNA level? if (pep.indexOf('X') >= 0) { addAmbiguousPeptides(ambg_tryptics, seq, is_prot ? null : na_seqs[idx], pep, start_pos); } else if (pep.indexOf('*') < 0) { unambg_tryptics.add(pep); } } idx++; } // 1a. if the K/R terminii have IUPAC codes then try to match that for (String na_seq : na_seqs) { DenseBitVector terminii_ambig_pos = new DenseBitVector(na_seq.length()); DenseBitVector terminii_unambig_pos = new DenseBitVector(na_seq.length()); for (int i=0; i<na_seq.length(); i += 3) { String codon = na_seq.substring(i, i+3); // NB: we dont have to match IUPAC code-free codons, since that has already been done above if (codon.matches("^CG[RMKWSBDHVN]") || codon.matches("^AA[RMKWSBDHVN]") || codon.matches("^AG[RMKWSBDHVN]")) { terminii_ambig_pos.set(i); } else if (codon.equals("AAA") || codon.equals("AAG") || codon.startsWith("CG") || codon.equals("AGA") || codon.equals("AGG")) { // K/R encoding positions vector terminii_unambig_pos.set(i); } } if (terminii_ambig_pos.cardinality() > 0) { // One of two cases is possible: // 1. the set bit corresponds to an IUPAC code marks the start of a tryptic peptide // 2. the set bit corresponds to an IUPAC code marks the end of a tryptic peptide // But the other terminii may or may not be ambiguous too.... long each_idx = 0; while ((each_idx = terminii_ambig_pos.nextSetBit(each_idx)) >= 0) { // case 1: tryptic peptide with IUPAC codon at both terminii long bit_idx = each_idx+1; while ((bit_idx = terminii_ambig_pos.nextSetBit(bit_idx)) >= 0) { long codon_idx = bit_idx - (bit_idx % 3); long each_codon_idx = each_idx - (each_idx % 3); long codons = (codon_idx - each_codon_idx) / 3 + 1; if (codons >= 8 && codons <= MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { addUnknownTerminiiPeptides(weak_tryptics, new StringBuffer(na_seq.substring((int)each_codon_idx, (int)(each_codon_idx+(codons*3)))), 0); } else if (codons > MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { break; } bit_idx++; } // case 1: tryptic peptide with IUPAC codon at start, but not at the end bit_idx = each_idx+1; while ((bit_idx = terminii_unambig_pos.nextSetBit(bit_idx)) >= 0) { long codon_idx = bit_idx - (bit_idx % 3); long each_codon_idx = each_idx - (each_idx % 3); long codons = (codon_idx - each_codon_idx) / 3 + 1; if (codons >= 8 && codons <= MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { addUnknownTerminiiPeptides(weak_tryptics, new StringBuffer(na_seq.substring((int)each_codon_idx, (int)(each_codon_idx+(codons*3)))), 0); } else if (codons > MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { break; } bit_idx++; } // case 2: tryptic peptide with IUPAC codon at end bit_idx = each_idx - (each_idx%3) - (MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA*3); if (bit_idx < 0) bit_idx = 0; while (bit_idx < each_idx && ((bit_idx = terminii_ambig_pos.nextSetBit(bit_idx)) >= 0)) { long codon_idx = bit_idx - (bit_idx % 3); long each_codon_idx = each_idx - (each_idx % 3); long codons = (each_codon_idx - codon_idx) / 3 + 1; if (codons >= 8 && codons <= MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { addUnknownTerminiiPeptides(weak_tryptics, new StringBuffer(na_seq.substring((int)codon_idx, (int)(codon_idx+(codons*3)))), 0); } else if (codons < 8) { break; } bit_idx++; } // case 2: tryptic peptide with non-ambiguous codon at start bit_idx = each_idx - (each_idx%3) - (MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA*3); if (bit_idx < 0) bit_idx = 0; while (bit_idx < each_idx && ((bit_idx = terminii_unambig_pos.nextSetBit(bit_idx)) >= 0)) { long codon_idx = bit_idx - (bit_idx % 3); long each_codon_idx = each_idx - (each_idx % 3); long codons = (each_codon_idx - codon_idx) / 3 + 1; if (codons >= 8 && codons <= MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { addUnknownTerminiiPeptides(weak_tryptics, new StringBuffer(na_seq.substring((int)codon_idx, (int)(codon_idx+(codons*3)))), 0); } else if (codons < 8) { break; } bit_idx++; } // next ambiguous base call each_idx++; } } } // ambig_tryptics & unambig_tryptics are now fully populated... so... boolean do_missed_cleaves = false; if (unambg_tryptics.size() > 0 || ambg_tryptics.size() > 0) { cells[1] = populate_cell(unambg_tryptics, true); cells[2] = populate_cell(ambg_tryptics, true); do_missed_cleaves = true; } // 2. avoid doing expensive missed cleave computation whenever possible if (do_missed_cleaves) { HashSet<String> mc_tryptics = new HashSet<String>(); HashSet<String> all_tryptics = new HashSet<String>(); all_tryptics.addAll(unambg_tryptics); all_tryptics.addAll(ambg_tryptics); all_tryptics.addAll(weak_tryptics); for (String s2 : seqs) { for (String tryptic : all_tryptics) { Pattern p2 = Pattern.compile("([KR]"+tryptic+")"); Matcher m = p2.matcher(s2); // single missed cleavage at either terminii while (m.find()) { mc_tryptics.add(m.group(1)); } p2 = Pattern.compile("("+tryptic+"[KR])"); m = p2.matcher(s2); while (m.find()) { mc_tryptics.add(m.group(1)); } // accept a "missed" cleavage involving a proline after [KR] at N-terminus // provided total length is ok (ie. likely to be measured by Mass Spec.) p2 = Pattern.compile("("+tryptic+"[^KR]{1,17}[KR])"); m = p2.matcher(s2); while (m.find()) { String pep = m.group(1); if (pep.length() < MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { mc_tryptics.add(pep); } } p2 = Pattern.compile("([KR][^KR]{1,17}"+tryptic+")"); m = p2.matcher(s2); while (m.find()) { String pep = m.group(1); if (pep.length() < MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA) { mc_tryptics.add(pep); } } } } if (mc_tryptics.size() > 0) { cells[3] = populate_cell(mc_tryptics, true); } if (weak_tryptics.size() > 0) { cells[4] = populate_cell(weak_tryptics, true); } } c.addRowToTable(new DefaultRow(r.getKey(), cells)); done++; if (((int)done) % 100 == 0) { exec.checkCanceled(); exec.setProgress(done / n_rows); } } } protected DataCell populate_cell(HashSet<String> tryptics, boolean no_stop_codons) { if (tryptics.size() < 1) { return DataType.getMissingCell(); } ArrayList<StringCell> tryptic_cells = new ArrayList<StringCell>(tryptics.size()); for (String cell_str : tryptics) { if (!no_stop_codons || cell_str.indexOf("*") < 0) tryptic_cells.add(new StringCell(cell_str)); } return CollectionCellFactory.createListCell(tryptic_cells); } /** * Similar to addAmbiguousPeptides() this method handles ambiguity in the first/last codon of the peptide * @param ambg_tryptics * @param codons * @param sb * @param start_pos */ protected void addUnknownTerminiiPeptides(HashSet<String> ambg_tryptics, StringBuffer codons, int start_pos) throws InvalidIUPACException { if (start_pos == 0) { codons = new StringBuffer(codons.toString().replaceAll("U", "T")); String codon = codons.substring(0, 3); if (codon.startsWith("AA")) { codons.setCharAt(2, 'A'); addUnknownTerminiiPeptides(ambg_tryptics, codons, start_pos+3); return; } else if (codon.startsWith("CG") || codon.startsWith("AG")) { codons.setCharAt(2, 'G'); addUnknownTerminiiPeptides(ambg_tryptics, codons, start_pos+3); return; } else { throw new InvalidIUPACException("Expected K/R encoding codon at start, found "+codon+" in "+codons); } } recurse(ambg_tryptics, codons, start_pos, codons.length()); } /** * A key method for calculation of the tryptics - it is only called when an unknown (X) * AA residue is known to exist in the current peptide. This routine is responsible for * examining the IUPAC code(s) that are the cause of the X and add possible variants * to the tryptics set. The nucleotide sequence, <code>na_seq</code> which was frame translated * from <code>seq</code> is also available, it is up to the implementation to decide if it should * use it or not. <code>pep</code> is the peptide with the unknown residue, <code>start_pos</code> * is the position in <code>seq</code> where the peptide starts (if the peptide occurs * multiple times in a given sequence, there will be separate calls of the method for each) * * @param tryptics * @param seq Always amino acid sequence. Some sort of translation from <code>na_seq</code> * @param na_seq Only non-null if the input sequence is DNA/RNA. * @param pep * @param start_pos */ protected void addAmbiguousPeptides(HashSet<String> tryptics, String seq, String na_seq, String pep, int start_pos) throws InvalidIUPACException { int x_count = 0; for (int i=0; i<pep.length(); i++) { if (pep.charAt(i) == 'X') x_count++; } na_seq = na_seq.replaceAll("U", "T"); // if nucleotide sequence not available, then nothing that can be done to resolve the ambiguity... so... if (na_seq == null) return; // otherwise look for IUPAC codes in the codons corresponding to the peptide and // try to perform all permutations int spos = start_pos * 3; StringBuffer na_codons = new StringBuffer(na_seq.substring(spos, spos+pep.length()*3)); assert(na_codons.length() % 3 == 0); recurse(tryptics, na_codons, 0, na_codons.length()); } protected void recurse(HashSet<String> tryptics, StringBuffer na_codons, int i, int length) throws InvalidIUPACException { if (i >= length) { // translate na_codons to AA and then add it since we've resolve IUPAC codes along the entire length StringBuffer pep = new StringBuffer(); for (int j=0; j<na_codons.length(); j+=3) { String key = ""+na_codons.charAt(j)+na_codons.charAt(j+1)+na_codons.charAt(j+2); String aa = aa_map.get(key.toUpperCase()); if (aa == null) throw new InvalidIUPACException("Invalid codon: "+key.toUpperCase()); pep.append(aa); } // stop codon probably indicates an invalid peptide, so we dont add these to the list if (pep.indexOf("*") < 0) tryptics.add(pep.toString()); return; } char c = na_codons.charAt(i); if (c == 'A' || c == 'C' || c == 'T' || c == 'U' || c == 'G') { recurse(tryptics, na_codons, i+1, length); return; } // else need to resolve the code via recursion if (c == 'R') { na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); } else if (c == 'Y') { na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'M') { na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); } else if (c == 'K') { na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'W') { na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'S') { na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); } else if (c == 'B') { na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'D') { na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'H') { na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'T'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'V') { na_codons.setCharAt(i, 'A'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'C'); recurse(tryptics, na_codons, i, length); na_codons.setCharAt(i, 'G'); recurse(tryptics, na_codons, i, length); // no need to worry about U } else if (c == 'N') { for (char c2 : new char[] { 'A', 'C', 'G', 'T'} ) { na_codons.setCharAt(i, c2); recurse(tryptics, na_codons, i, length); } } else { throw new InvalidIUPACException("Unknown IUPAC code: "+c); } } @Override public DataTableSpec get_table_spec() { DataColumnSpec[] cols = new DataColumnSpec[5]; cols[0] = new DataColumnSpecCreator("Input Sequence", StringCell.TYPE).createSpec(); cols[1] = new DataColumnSpecCreator("Unambiguous Tryptic Peptides", ListCell.getCollectionType(StringCell.TYPE) ).createSpec(); cols[2] = new DataColumnSpecCreator("Ambiguous Tryptic Peptides", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); cols[3] = new DataColumnSpecCreator("Single missed cleavage Tryptic Peptides", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); cols[4] = new DataColumnSpecCreator("Weak Ambiguous Tryptic Peptides", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); return new DataTableSpec(cols); } @Override public boolean isMerged() { return false; } }